Imputation

source("gamlss_mice.R", local = knitr::knit_global())

## 
##  Variables sorted by number of missings: 
##  Variable     Count
##       hgt 0.2009612
##       sex 0.2008611
##       age 0.1974567
##       reg 0.1968559
##       wgt 0.1935516
## 

## 
##  iter imp variable
##   1   1  wgt  reg  age  sex  hgt
##   1   2  wgt  reg  age  sex  hgt

NA statistics for the dataset with missing values

# Call plot_na_pie() for `wgt` and keep its return value; `wgt_nas` is later
# passed as `nas` to create_compare_data() in the weight sections.
# NOTE(review): plot_na_pie() is not defined in this section (presumably it
# comes from gamlss_mice.R); the number echoed below looks like the NA count
# of the column — confirm.
wgt_nas <- plot_na_pie("wgt")

## [1] 1933

# Same call for `hgt`; `hgt_nas` is passed as `nas` in the height sections.
hgt_nas <- plot_na_pie("hgt")

## [1] 2007

# Same call for `age`; `age_nas` is not used in the visible part of this file.
age_nas <- plot_na_pie("age")

## [1] 1972

RF: Weight

RF: compare the imputed datasets with the original dataset

# Comparison frame for the RF imputations of `wgt`, built with
# sp_impt = "method". create_compare_data() is defined outside this section.
df_rf_wgt <- create_compare_data(
  data, miss_data, impt_mice_rf_data,
  nas = wgt_nas, col = "wgt", method = "rf", sp_impt = "method"
)

# `wgt` against `age`: semi-transparent points plus a smoother per `source`.
ggplot(df_rf_wgt, aes(x = age, y = wgt, colour = source)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Distribution of `wgt` for each `source`.
ggplot(df_rf_wgt, aes(x = source, y = wgt, colour = source)) +
  geom_boxplot()

# Same boxplot, with the layer's colour remapped to `sex`.
ggplot(df_rf_wgt, aes(x = source, y = wgt, colour = source)) +
  geom_boxplot(mapping = aes(colour = sex))

RF:compare split with Sex

# Rebuild the RF `wgt` comparison frame with sp_impt = "sex"; this replaces
# the df_rf_wgt that was created with sp_impt = "method".
df_rf_wgt <- create_compare_data(
  data, miss_data, impt_mice_rf_data,
  nas = wgt_nas, col = "wgt", method = "rf", sp_impt = "sex"
)

# `wgt` against `age`: semi-transparent points plus a smoother per `source`.
ggplot(df_rf_wgt, aes(x = age, y = wgt, colour = source)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Distribution of `wgt` for each `source`.
ggplot(df_rf_wgt, aes(x = source, y = wgt, colour = source)) +
  geom_boxplot()

RF:compare by NA counts

# `wgt` against `age`, coloured by `na_count`, with a smoother per colour group.
ggplot(df_rf_wgt, aes(x = age, y = wgt, colour = na_count)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# `wgt` per `na_count` level, boxes coloured by `sex`.
ggplot(df_rf_wgt, aes(x = na_count, y = wgt, colour = sex)) +
  geom_boxplot()

# Same scatter, restricted to rows whose `na_count` matches "4:" or "True".
ggplot(
  subset(df_rf_wgt, grepl("4:|True", na_count)),
  aes(x = age, y = wgt, colour = na_count)
) +
  geom_point(alpha = 0.4) +
  stat_smooth()

GAMLSS: Weight

GAMLSS: compare the imputed datasets with the original dataset

# Comparison frame for the GAMLSS imputations of `wgt`, built with
# sp_impt = "method". create_compare_data() is defined outside this section.
#
# `method` used to be "cart" here, apparently copied from the CART section,
# while the data passed in are the GAMLSS imputations. It now matches the
# data set, the same way the RF and CART sections pair their `method` value
# with their imputed data.
# NOTE(review): assumes create_compare_data() treats `method` as a label and
# accepts "gamlss" — confirm against its definition.
df_gamlss_wgt <- create_compare_data(
  data, miss_data, impt_mice_gamlss_data,
  nas = wgt_nas, col = "wgt", method = "gamlss", sp_impt = "method"
)

# `wgt` against `age`: semi-transparent points plus a smoother per `source`.
ggplot(df_gamlss_wgt, aes(x = age, y = wgt, colour = source)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Distribution of `wgt` for each `source`.
ggplot(df_gamlss_wgt, aes(x = source, y = wgt, colour = source)) +
  geom_boxplot()

# Same boxplot, with the layer's colour remapped to `sex`.
ggplot(df_gamlss_wgt, aes(x = source, y = wgt, colour = source)) +
  geom_boxplot(mapping = aes(colour = sex))

GAMLSS:compare split with Sex

# Rebuild the GAMLSS `wgt` comparison frame with sp_impt = "sex"; this
# replaces the df_gamlss_wgt that was created with sp_impt = "method".
#
# `method` used to be "cart" here although the GAMLSS imputations are passed
# in; it now matches the data set, as the RF and CART sections do.
# NOTE(review): assumes create_compare_data() treats `method` as a label and
# accepts "gamlss" — confirm against its definition.
df_gamlss_wgt <- create_compare_data(
  data, miss_data, impt_mice_gamlss_data,
  nas = wgt_nas, col = "wgt", method = "gamlss", sp_impt = "sex"
)

# `wgt` against `age`: semi-transparent points plus a smoother per `source`.
ggplot(df_gamlss_wgt, aes(x = age, y = wgt, colour = source)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Distribution of `wgt` for each `source`.
ggplot(df_gamlss_wgt, aes(x = source, y = wgt, colour = source)) +
  geom_boxplot()

GAMLSS:compare by NA counts

# `wgt` against `age`, coloured by `na_count`, with a smoother per colour group.
ggplot(df_gamlss_wgt, aes(x = age, y = wgt, colour = na_count)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# `wgt` per `na_count` level, boxes coloured by `sex`.
ggplot(df_gamlss_wgt, aes(x = na_count, y = wgt, colour = sex)) +
  geom_boxplot()

# Same scatter, restricted to rows whose `na_count` matches "4:" or "True".
ggplot(
  subset(df_gamlss_wgt, grepl("4:|True", na_count)),
  aes(x = age, y = wgt, colour = na_count)
) +
  geom_point(alpha = 0.4) +
  stat_smooth()

CART: Weight

CART: compare the imputed datasets with the original dataset

# Comparison frame for the CART imputations of `wgt`, built with
# sp_impt = "method". create_compare_data() is defined outside this section.
df_cart_wgt <- create_compare_data(
  data, miss_data, impt_mice_cart_data,
  nas = wgt_nas, col = "wgt", method = "cart", sp_impt = "method"
)

# `wgt` against `age`: semi-transparent points plus a smoother per `source`.
ggplot(df_cart_wgt, aes(x = age, y = wgt, colour = source)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Distribution of `wgt` for each `source`.
ggplot(df_cart_wgt, aes(x = source, y = wgt, colour = source)) +
  geom_boxplot()

# Same boxplot, with the layer's colour remapped to `sex`.
ggplot(df_cart_wgt, aes(x = source, y = wgt, colour = source)) +
  geom_boxplot(mapping = aes(colour = sex))

CART:compare split with Sex

# Rebuild the CART `wgt` comparison frame with sp_impt = "sex"; this replaces
# the df_cart_wgt that was created with sp_impt = "method".
df_cart_wgt <- create_compare_data(
  data, miss_data, impt_mice_cart_data,
  nas = wgt_nas, col = "wgt", method = "cart", sp_impt = "sex"
)

# `wgt` against `age`: semi-transparent points plus a smoother per `source`.
ggplot(df_cart_wgt, aes(x = age, y = wgt, colour = source)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Distribution of `wgt` for each `source`.
ggplot(df_cart_wgt, aes(x = source, y = wgt, colour = source)) +
  geom_boxplot()

CART:compare by NA counts

# `wgt` against `age`, coloured by `na_count`, with a smoother per colour group.
ggplot(df_cart_wgt, aes(x = age, y = wgt, colour = na_count)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# `wgt` per `na_count` level, boxes coloured by `sex`.
ggplot(df_cart_wgt, aes(x = na_count, y = wgt, colour = sex)) +
  geom_boxplot()

# Same scatter, restricted to rows whose `na_count` matches "4:" or "True".
ggplot(
  subset(df_cart_wgt, grepl("4:|True", na_count)),
  aes(x = age, y = wgt, colour = na_count)
) +
  geom_point(alpha = 0.4) +
  stat_smooth()

RF: Height

RF: compare the imputed datasets with the original dataset

# Comparison frame for the RF imputations of `hgt`, built with
# sp_impt = "method". create_compare_data() is defined outside this section.
df_rf_hgt <- create_compare_data(
  data, miss_data, impt_mice_rf_data,
  nas = hgt_nas, col = "hgt", method = "rf", sp_impt = "method"
)

# `hgt` against `age`: semi-transparent points plus a smoother per `source`.
ggplot(df_rf_hgt, aes(x = age, y = hgt, colour = source)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Distribution of `hgt` for each `source`.
ggplot(df_rf_hgt, aes(x = source, y = hgt, colour = source)) +
  geom_boxplot()

# Same boxplot, with the layer's colour remapped to `sex`.
ggplot(df_rf_hgt, aes(x = source, y = hgt, colour = source)) +
  geom_boxplot(mapping = aes(colour = sex))

RF:compare split with Sex

# Rebuild the RF `hgt` comparison frame with sp_impt = "sex"; this replaces
# the df_rf_hgt that was created with sp_impt = "method".
df_rf_hgt <- create_compare_data(
  data, miss_data, impt_mice_rf_data,
  nas = hgt_nas, col = "hgt", method = "rf", sp_impt = "sex"
)

# `hgt` against `age`: semi-transparent points plus a smoother per `source`.
ggplot(df_rf_hgt, aes(x = age, y = hgt, colour = source)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Distribution of `hgt` for each `source`.
ggplot(df_rf_hgt, aes(x = source, y = hgt, colour = source)) +
  geom_boxplot()

RF:compare by NA counts

# `hgt` against `age`, coloured by `na_count`, with a smoother per colour group.
ggplot(df_rf_hgt, aes(x = age, y = hgt, colour = na_count)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# `hgt` per `na_count` level, boxes coloured by `sex`.
ggplot(df_rf_hgt, aes(x = na_count, y = hgt, colour = sex)) +
  geom_boxplot()

# Same scatter, restricted to rows whose `na_count` matches "4:" or "True".
ggplot(
  subset(df_rf_hgt, grepl("4:|True", na_count)),
  aes(x = age, y = hgt, colour = na_count)
) +
  geom_point(alpha = 0.4) +
  stat_smooth()

GAMLSS: Height

GAMLSS: compare the imputed datasets with the original dataset

# Comparison frame for the GAMLSS imputations of `hgt`, built with
# sp_impt = "method". create_compare_data() is defined outside this section.
#
# `method` used to be "cart" here, apparently copied from the CART section,
# while the data passed in are the GAMLSS imputations. It now matches the
# data set, the same way the RF and CART sections pair their `method` value
# with their imputed data.
# NOTE(review): assumes create_compare_data() treats `method` as a label and
# accepts "gamlss" — confirm against its definition.
df_gamlss_hgt <- create_compare_data(
  data, miss_data, impt_mice_gamlss_data,
  nas = hgt_nas, col = "hgt", method = "gamlss", sp_impt = "method"
)

# `hgt` against `age`: semi-transparent points plus a smoother per `source`.
ggplot(df_gamlss_hgt, aes(x = age, y = hgt, colour = source)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Distribution of `hgt` for each `source`.
ggplot(df_gamlss_hgt, aes(x = source, y = hgt, colour = source)) +
  geom_boxplot()

# Same boxplot, with the layer's colour remapped to `sex`.
ggplot(df_gamlss_hgt, aes(x = source, y = hgt, colour = source)) +
  geom_boxplot(mapping = aes(colour = sex))

GAMLSS:compare split with Sex

# Rebuild the GAMLSS `hgt` comparison frame with sp_impt = "sex"; this
# replaces the df_gamlss_hgt that was created with sp_impt = "method".
#
# `method` used to be "cart" here although the GAMLSS imputations are passed
# in; it now matches the data set, as the RF and CART sections do.
# NOTE(review): assumes create_compare_data() treats `method` as a label and
# accepts "gamlss" — confirm against its definition.
df_gamlss_hgt <- create_compare_data(
  data, miss_data, impt_mice_gamlss_data,
  nas = hgt_nas, col = "hgt", method = "gamlss", sp_impt = "sex"
)

# `hgt` against `age`: semi-transparent points plus a smoother per `source`.
ggplot(df_gamlss_hgt, aes(x = age, y = hgt, colour = source)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Distribution of `hgt` for each `source`.
ggplot(df_gamlss_hgt, aes(x = source, y = hgt, colour = source)) +
  geom_boxplot()

GAMLSS:compare by NA counts

# `hgt` against `age`, coloured by `na_count`, with a smoother per colour group.
ggplot(df_gamlss_hgt, aes(x = age, y = hgt, colour = na_count)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# `hgt` per `na_count` level, boxes coloured by `sex`.
ggplot(df_gamlss_hgt, aes(x = na_count, y = hgt, colour = sex)) +
  geom_boxplot()

# Same scatter, restricted to rows whose `na_count` matches "4:" or "True".
ggplot(
  subset(df_gamlss_hgt, grepl("4:|True", na_count)),
  aes(x = age, y = hgt, colour = na_count)
) +
  geom_point(alpha = 0.4) +
  stat_smooth()

CART: Height

CART: compare the imputed datasets with the original dataset

# Comparison frame for the CART imputations of `hgt`, built with
# sp_impt = "method". create_compare_data() is defined outside this section.
df_cart_hgt <- create_compare_data(
  data, miss_data, impt_mice_cart_data,
  nas = hgt_nas, col = "hgt", method = "cart", sp_impt = "method"
)

# `hgt` against `age`: semi-transparent points plus a smoother per `source`.
ggplot(df_cart_hgt, aes(x = age, y = hgt, colour = source)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Distribution of `hgt` for each `source`.
ggplot(df_cart_hgt, aes(x = source, y = hgt, colour = source)) +
  geom_boxplot()

# Same boxplot, with the layer's colour remapped to `sex`.
ggplot(df_cart_hgt, aes(x = source, y = hgt, colour = source)) +
  geom_boxplot(mapping = aes(colour = sex))

CART:compare split with Sex

# Rebuild the CART `hgt` comparison frame with sp_impt = "sex"; this replaces
# the df_cart_hgt that was created with sp_impt = "method".
df_cart_hgt <- create_compare_data(
  data, miss_data, impt_mice_cart_data,
  nas = hgt_nas, col = "hgt", method = "cart", sp_impt = "sex"
)

# `hgt` against `age`: semi-transparent points plus a smoother per `source`.
ggplot(df_cart_hgt, aes(x = age, y = hgt, colour = source)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Distribution of `hgt` for each `source`.
ggplot(df_cart_hgt, aes(x = source, y = hgt, colour = source)) +
  geom_boxplot()

CART:compare by NA counts

# `hgt` against `age`, coloured by `na_count`, with a smoother per colour group.
ggplot(df_cart_hgt, aes(x = age, y = hgt, colour = na_count)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# `hgt` per `na_count` level, boxes coloured by `sex`.
ggplot(df_cart_hgt, aes(x = na_count, y = hgt, colour = sex)) +
  geom_boxplot()

# Same scatter, restricted to rows whose `na_count` matches "4:" or "True".
ggplot(
  subset(df_cart_hgt, grepl("4:|True", na_count)),
  aes(x = age, y = hgt, colour = na_count)
) +
  geom_point(alpha = 0.4) +
  stat_smooth()

compare miss to true data:wgt

# Plot the imputed `wgt` values against the true ones for the rows where `wgt`
# is NA in miss_data: one three-panel figure (rf / gamlss / cart) per imputed
# data set.
#
# The loop used to run `for (i in 1:10)` while always indexing `[[3]]`, so the
# same figure was rendered ten times. The imputed data sets are now indexed by
# `i`, and the loop runs over the imputations that exist in all three lists.
# NOTE(review): assumes each impt_mice_*_data object is a list holding one
# completed data frame per imputation — confirm.
miss_index <- which(is.na(miss_data$wgt))

# Loop-invariant, so computed once. The name `sex` is kept because qplot()
# uses it as the legend title.
sex <- factor(data$sex[miss_index])

n_imp <- min(
  length(impt_mice_rf_data),
  length(impt_mice_gamlss_data),
  length(impt_mice_cart_data)
)

for (i in seq_len(n_imp)) {
  g1 <- qplot(data$wgt[miss_index], impt_mice_rf_data[[i]]$wgt[miss_index],
              col = sex) +
    stat_smooth() +
    ylim(-10, 105) +
    ylab("rf wgt") +
    xlab("data wgt") +
    theme(legend.position = "top")

  g2 <- qplot(data$wgt[miss_index], impt_mice_gamlss_data[[i]]$wgt[miss_index],
              col = sex) +
    stat_smooth() +
    ylim(-10, 105) +
    ylab("gamlss wgt") +
    xlab("data wgt") +
    theme(legend.position = "top")

  g3 <- qplot(data$wgt[miss_index], impt_mice_cart_data[[i]]$wgt[miss_index],
              col = sex) +
    stat_smooth() +
    ylim(-10, 105) +
    ylab("cart wgt") +
    xlab("data wgt") +
    theme(legend.position = "top")

  # Label each figure so the imputations can be told apart.
  grid.arrange(g1, g2, g3, ncol = 3, top = paste("Imputation", i))
}
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

compare miss to true data:hgt

# Plot the imputed `hgt` values against the true ones for the rows where `hgt`
# is NA in miss_data: one three-panel figure (rf / gamlss / cart) per imputed
# data set.
#
# The loop used to run `for (i in 1:10)` while always indexing `[[3]]`, so the
# same figure was rendered ten times. The imputed data sets are now indexed by
# `i`, and the loop runs over the imputations that exist in all three lists.
# NOTE(review): assumes each impt_mice_*_data object is a list holding one
# completed data frame per imputation — confirm.
miss_index <- which(is.na(miss_data$hgt))

# Loop-invariant, so computed once. The name `sex` is kept because qplot()
# uses it as the legend title.
sex <- factor(data$sex[miss_index])

n_imp <- min(
  length(impt_mice_rf_data),
  length(impt_mice_gamlss_data),
  length(impt_mice_cart_data)
)

for (i in seq_len(n_imp)) {
  g1 <- qplot(data$hgt[miss_index], impt_mice_rf_data[[i]]$hgt[miss_index],
              col = sex) +
    stat_smooth() +
    ylim(30, 215) +
    ylab("rf hgt") +
    xlab("data hgt") +
    theme(legend.position = "top")

  g2 <- qplot(data$hgt[miss_index], impt_mice_gamlss_data[[i]]$hgt[miss_index],
              col = sex) +
    stat_smooth() +
    ylim(30, 215) +
    ylab("gamlss hgt") +
    xlab("data hgt") +
    theme(legend.position = "top")

  g3 <- qplot(data$hgt[miss_index], impt_mice_cart_data[[i]]$hgt[miss_index],
              col = sex) +
    stat_smooth() +
    ylim(30, 215) +
    ylab("cart hgt") +
    xlab("data hgt") +
    theme(legend.position = "top")

  # Label each figure so the imputations can be told apart.
  grid.arrange(g1, g2, g3, ncol = 3, top = paste("Imputation", i))
}

compare miss to true data:age

# Plot the imputed `age` values against the true ones for the rows where `age`
# is NA in miss_data: one three-panel figure (rf / gamlss / cart) per imputed
# data set.
#
# The loop used to run `for (i in 1:10)` while always indexing `[[3]]`, so the
# same figure was rendered ten times. The imputed data sets are now indexed by
# `i`, and the loop runs over the imputations that exist in all three lists.
# NOTE(review): assumes each impt_mice_*_data object is a list holding one
# completed data frame per imputation — confirm.
miss_index <- which(is.na(miss_data$age))

# Loop-invariant, so computed once. The name `sex` is kept because qplot()
# uses it as the legend title.
sex <- factor(data$sex[miss_index])

n_imp <- min(
  length(impt_mice_rf_data),
  length(impt_mice_gamlss_data),
  length(impt_mice_cart_data)
)

for (i in seq_len(n_imp)) {
  g1 <- qplot(data$age[miss_index], impt_mice_rf_data[[i]]$age[miss_index],
              col = sex) +
    stat_smooth() +
    ylim(-5, 22) +
    ylab("rf age") +
    xlab("data age") +
    theme(legend.position = "top")

  g2 <- qplot(data$age[miss_index], impt_mice_gamlss_data[[i]]$age[miss_index],
              col = sex) +
    stat_smooth() +
    ylim(-5, 22) +
    ylab("gamlss age") +
    xlab("data age") +
    theme(legend.position = "top")

  g3 <- qplot(data$age[miss_index], impt_mice_cart_data[[i]]$age[miss_index],
              col = sex) +
    stat_smooth() +
    ylim(-5, 22) +
    ylab("cart age") +
    xlab("data age") +
    theme(legend.position = "top")

  # Label each figure so the imputations can be told apart.
  grid.arrange(g1, g2, g3, ncol = 3, top = paste("Imputation", i))
}